__author__ = 'Administrator'

# step 1, get the label index
# Scan trec_10.label.txt and give every distinct label the next free integer
# id, in order of first appearance.  The label is the part before the first
# ":" of the first space-separated token on each line
# (presumably lines look like "COARSE:fine question text" -- TODO confirm).
label_dict = dict()
# The context manager guarantees the handle is released even if parsing
# raises; the name ``readfile_all`` stays bound (to a closed file) afterwards.
with open("trec_10.label.txt", 'r') as readfile_all:
    # Printed after the file has been opened, matching the original ordering.
    # A parenthesized single-argument print behaves the same on Python 2 and 3.
    print("step 1")
    for each in readfile_all:
        # Drop leading whitespace and the trailing newline, then tokenize on
        # single spaces (consecutive spaces yield empty tokens).
        words = each.lstrip().strip("\n").split(" ")
        label = words[0].split(":")[0]
        if label not in label_dict:
            # Ids are consecutive from 0, so the next id is the current size.
            label_dict[label] = len(label_dict)

# step 2, write the train data
# Convert train_5500.label.txt into trec_train.txt: each output line is the
# integer id of the line's label followed by the remaining tokens, all joined
# with "@".  A label that is missing from ``label_dict`` raises KeyError.
print("step 2----> write the train data")
# Release the handle used to build the label index; closing an already
# closed file is a harmless no-op.
readfile_all.close()
# Context managers make sure both files are closed (and the output flushed)
# even when a line fails to convert.  The input is opened first so that a
# missing input file does not create or truncate the output file.
with open("train_5500.label.txt", 'r') as readfile:
    with open("trec_train.txt", 'w') as writefile:
        for each in readfile:
            # Drop leading whitespace and the trailing newline, then tokenize
            # on single spaces.
            words = each.lstrip().strip("\n").split(" ")
            label = words[0].split(":")[0]
            index = label_dict[label]
            # words[0] is the label token; everything after it is the text.
            writefile.write("@".join([str(index)] + words[1:]) + "\n")

# step 3, write the test data
# Convert trec_10.label.txt into trec_test.txt: each output line is the
# integer id of the line's label followed by the remaining tokens, all joined
# with "@".  A label that is missing from ``label_dict`` raises KeyError.
print("step 3----> write the test data")
# Context managers make sure both files are closed (and the output flushed)
# even when a line fails to convert.  The input is opened first so that a
# missing input file does not create or truncate the output file.
with open("trec_10.label.txt", 'r') as readfile:
    with open("trec_test.txt", 'w') as writefile:
        for each in readfile:
            # Drop leading whitespace and the trailing newline, then tokenize
            # on single spaces.
            words = each.lstrip().strip("\n").split(" ")
            label = words[0].split(":")[0]
            index = label_dict[label]
            # words[0] is the label token; everything after it is the text.
            writefile.write("@".join([str(index)] + words[1:]) + "\n")